### KEY PARAMETERS
library(BiocStyle)
# Cell QC - Day 0
min_total_counts_0 <- 750
max_total_counts_0 <- 7500
min_total_genes_0 <- 325
max_total_genes_0 <- 2000
max_pct_mt_expression_0 <- 17.5
# Cell QC - Day 2
min_total_counts_2 <- 650
max_total_counts_2 <- 15000
min_total_genes_2 <- 325
max_total_genes_2 <- 4000
max_pct_mt_expression_2 <- 17.5
# Cell QC - Day 0 (rep2)
min_total_counts_0_rep2 <- 1250
max_total_counts_0_rep2 <- 10000
min_total_genes_0_rep2 <- 325
max_total_genes_0_rep2 <- 4000
max_pct_mt_expression_0_rep2 <- 25
# Cell QC - Day 1 (rep2)
min_total_counts_1_rep2 <- 800
max_total_counts_1_rep2 <- 15000
min_total_genes_1_rep2 <- 325
max_total_genes_1_rep2 <- 4000
max_pct_mt_expression_1_rep2 <-17.5
# Gene QC
min_total_cells <- 10
We are investigating whether culturing cells after thawing them eliminates the bias introduced by sampling time. The objective of this notebook is to filter out poor-quality cells and genes and normalize gene expression counts.
library(scater)
library(scran)
library(Seurat)
library(ggpubr)
library(ggridges)
library(DoubletFinder)
library(tidyverse)
source("bin/utils.R")
We load the demuliplexed Seurat objects:
t_act_l <- readRDS("results/R_objects/t_act_Seurat_list_demultiplexed.rds")
To increase our resolution, we will already rule out the cells that were labeled as “Negative”. For now, we will keep the detected Doublets as we will use them as ground-truth in the doublet detection step below:
t_act_l <- map(t_act_l, function(seurat) {
seurat <- subset(seurat, subset = hash.ID != "Negative")
})
There are 3 essential quality control metrics that will determine if we include or exclude a cell:
The only metric missing is the mt expression:
t_act_l <- map(t_act_l, function(seurat) {
seurat[["percent_mt"]] <- PercentageFeatureSet(seurat, pattern = "^MT-")
seurat
})
A first important realization is that different libraries can have differences in QC metric distributions. For instance, if one library was sequenced deeper than another, then the notion of ‘poor-quality cell’ between both based on QC metrics will differ. If that is the case, we will need to establish different thresholds for each condition.
Let us inspect such distributions:
vars <- c("nFeature_RNA", "percent_mt")
y_titles <- c("# detected genes", "% mitochondrial expression")
qc_distr_gg <- map2(vars, y_titles, function(var, y_title) {
t_act_l %>%
map(~ .x@meta.data) %>%
bind_rows(.id = "day") %>%
mutate(day = str_remove(day, "Tcell_activation_")) %>%
separate(col = "hash.ID", into = c("donor", "time", "temperature", "day"), sep = "-") %>%
filter(!is.na(time)) %>%
mutate(time = factor(time, levels = c("0h", "8h", "24h"))) %>%
ggplot(aes_string("time", var, fill = "time")) +
geom_boxplot() +
facet_grid(day ~ donor) +
labs(x = "", y = y_title) +
theme_bw() +
theme(legend.position = "none")
})
qc_distr_gg
## [[1]]
##
## [[2]]
Another important exploratory analysis is the joint distribution between the 3 metrics. Specially, a high mitochondrial expression can also be present in metabolically active cells. Therefore, we need to assess how this covaries with total counts. If we find that most cells with large mitochondrial activity also have few genes/UMI, then we can be certain that they are of poor-quality:
qc_titles <- c("Library Size (total UMI)", "Number of detected genes", "% mitochondrial expression")
joint_qc <- map(t_act_l, function(seurat) {
joint_qc_gg <- seurat@meta.data %>%
ggplot(aes(nCount_RNA, nFeature_RNA, color = percent_mt)) +
geom_point(alpha = 0.5) +
scale_color_viridis_c() +
labs(x = qc_titles[1], y = qc_titles[2], color = qc_titles[3]) +
theme_classic()
joint_qc_gg
})
joint_qc
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
Now that we have a better appreciation for our data, we can proceed to decide on the thresholds for each metric. We will do so by plotting a histogram for each metric:
qc_colnames <- c("nCount_RNA", "nFeature_RNA", "percent_mt")
hist_counts_gg1 <- map2(t_act_l, c(10000, 20000, 15000, 15000), function(seurat, x_max) {
hist_counts_gg1 <- plot_histogram_seurat(
seurat,
qc_metric = qc_colnames[1],
limits = c(0, x_max),
title = qc_titles[1],
log_scale = FALSE
)
hist_counts_gg1
})
hist_counts_gg1
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
We see a right-skewed distribution, which likely encapsulates the increased transcription after T-cell activation. We can increase the resolution:
hist_counts_gg2 <- map(t_act_l, function(seurat) {
hist_counts_gg2 <- plot_histogram_seurat(
seurat,
qc_metric = qc_colnames[1],
limits = c(0, 4000),
title = qc_titles[1],
log_scale = FALSE
)
hist_counts_gg2
})
hist_counts_gg2
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
For day 0 we will threshold at 750, whilst for day 2 we will threshold at 650. Moreover we will require a maximum of 7500 and 1.510^{4} for days 0 and 2, respectively:
iterable <- list(
hist_counts_gg2,
c(min_total_counts_0, min_total_counts_2, min_total_counts_0_rep2, min_total_counts_1_rep2),
c(max_total_counts_0, max_total_counts_2, max_total_counts_0_rep2, max_total_counts_1_rep2)
)
hist_counts_gg3 <- pmap(iterable, function(hist, min_counts, max_counts) {
hist +
geom_vline(xintercept = min_counts, linetype = "dashed", color = "red") +
geom_vline(xintercept = max_counts, linetype = "dashed", color = "red")
})
hist_counts_gg3
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
hist_n_genes_gg1 <- map2(t_act_l, c(4000, 8000, 6000, 6000), function(seurat, x_max) {
plot_histogram_seurat(
seurat,
qc_metric = qc_colnames[2],
limits = c(0, x_max),
title = qc_titles[2],
log_scale = FALSE
)
})
hist_n_genes_gg1
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
For day 0 we see two modes. They likely respresent two subpopulations, so we will choose to be permissive and filter out cells with less than 325 or more than 2000. On the other hand, day 0 possesses a large right tail. We rule out cells with less than 325or more than 4000:
iterable <- list(
hist_n_genes_gg1,
c(min_total_genes_0, min_total_genes_2, min_total_genes_0_rep2, min_total_genes_1_rep2),
c(max_total_genes_0, max_total_genes_2, max_total_genes_0_rep2, max_total_genes_1_rep2)
)
hist_n_genes_gg2 <- pmap(iterable, function(hist, min_counts, max_counts) {
hist +
geom_vline(xintercept = min_counts, linetype = "dashed", color = "red") +
geom_vline(xintercept = max_counts, linetype = "dashed", color = "red")
})
hist_n_genes_gg2
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
mt_hists <- map(t_act_l, function(seurat) {
mt_hist <- plot_histogram_seurat(
seurat,
qc_metric = qc_colnames[3],
limits = c(0, 100),
title = qc_titles[3],
log_scale = FALSE
)
mt_hist
})
mt_hists
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
We see a long tail of cells with high mitochondrial expression. As we saw that high % mt expression correlated with low number of genes, we will visually follow the normal distribution and threshold for day 0 and 2 at 17.5 and 17.5, respectively:
iterable <- c(max_pct_mt_expression_0, max_pct_mt_expression_2, max_pct_mt_expression_0_rep2, max_pct_mt_expression_1_rep2)
mt_hists <- map2(mt_hists, iterable, function(p, x_max) {
p +
geom_vline(xintercept = x_max, linetype = "dashed", color = "red")
})
mt_hists
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
Overall, we will consider as poor-quality any cell that satisfies any of the following conditions:
DAY 0 (rep1):
is_poor_quality_0 <-
t_act_l$Tcell_activation_day0$nCount_RNA < min_total_counts_0 |
t_act_l$Tcell_activation_day0$nCount_RNA > max_total_counts_0 |
t_act_l$Tcell_activation_day0$nFeature_RNA < min_total_genes_0 |
t_act_l$Tcell_activation_day0$nFeature_RNA > max_total_genes_0 |
t_act_l$Tcell_activation_day0$percent_mt > max_pct_mt_expression_0
table(is_poor_quality_0)
## is_poor_quality_0
## FALSE TRUE
## 26562 1671
DAY 2 (rep1):
is_poor_quality_2 <-
t_act_l$Tcell_activation_day2$nCount_RNA < min_total_counts_2 |
t_act_l$Tcell_activation_day2$nCount_RNA > max_total_counts_2 |
t_act_l$Tcell_activation_day2$nFeature_RNA < min_total_genes_2 |
t_act_l$Tcell_activation_day2$nFeature_RNA > max_total_genes_2 |
t_act_l$Tcell_activation_day2$percent_mt > max_pct_mt_expression_2
table(is_poor_quality_2)
## is_poor_quality_2
## FALSE TRUE
## 7135 1964
DAY 0 (rep2):
is_poor_quality_0_rep2 <-
t_act_l$Tcell_activation_day0_rep2$nCount_RNA < min_total_counts_0_rep2 |
t_act_l$Tcell_activation_day0_rep2$nCount_RNA > max_total_counts_0_rep2 |
t_act_l$Tcell_activation_day0_rep2$nFeature_RNA < min_total_genes_0_rep2 |
t_act_l$Tcell_activation_day0_rep2$nFeature_RNA > max_total_genes_0_rep2 |
t_act_l$Tcell_activation_day0_rep2$percent_mt > max_pct_mt_expression_0_rep2
table(is_poor_quality_0_rep2)
## is_poor_quality_0_rep2
## FALSE TRUE
## 12438 1260
DAY 1 (rep2):
is_poor_quality_1_rep2 <-
t_act_l$Tcell_activation_day1_rep2$nCount_RNA < min_total_counts_1_rep2 |
t_act_l$Tcell_activation_day1_rep2$nCount_RNA > max_total_counts_1_rep2 |
t_act_l$Tcell_activation_day1_rep2$nFeature_RNA < min_total_genes_1_rep2 |
t_act_l$Tcell_activation_day1_rep2$nFeature_RNA > max_total_genes_1_rep2 |
t_act_l$Tcell_activation_day1_rep2$percent_mt > max_pct_mt_expression_1_rep2
table(is_poor_quality_1_rep2)
## is_poor_quality_1_rep2
## FALSE TRUE
## 10548 1108
Of note, we can compare the proportion of poor quality cells across donors:
t_act_l$Tcell_activation_day0$is_low_quality <- is_poor_quality_0
t_act_l$Tcell_activation_day2$is_low_quality <- is_poor_quality_2
t_act_l$Tcell_activation_day0_rep2$is_low_quality <- is_poor_quality_0_rep2
t_act_l$Tcell_activation_day1_rep2$is_low_quality <- is_poor_quality_1_rep2
map(t_act_l, function(seurat) {
seurat@meta.data %>%
separate(col = "hash.ID", into = c("donor", "time", "temperature", "day"), sep = "-") %>%
filter(!is.na(day)) %>%
mutate(time = factor(time, levels = c("0h", "8h", "24h"))) %>%
group_by(donor, time) %>%
summarise(pct_low_quality = mean(is_low_quality) * 100) %>%
ggplot(aes(time, pct_low_quality, fill = time)) +
geom_col() +
labs(y = "% low-quality cells") +
facet_wrap(~donor) +
theme_bw() +
theme(legend.position = "none")
})
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
In light of the above, we will discard the following cells:
t_act_l <- map(t_act_l, function(seurat) {
seurat_sub <- subset(seurat, subset = is_low_quality == FALSE)
seurat_sub
})
t_act_l
## $Tcell_activation_day0
## An object of class Seurat
## 33544 features across 26562 samples within 2 assays
## Active assay: RNA (33538 features)
## 1 other assay present: HTO
##
## $Tcell_activation_day2
## An object of class Seurat
## 33544 features across 7135 samples within 2 assays
## Active assay: RNA (33538 features)
## 1 other assay present: HTO
##
## $Tcell_activation_day0_rep2
## An object of class Seurat
## 33544 features across 12438 samples within 2 assays
## Active assay: RNA (33538 features)
## 1 other assay present: HTO
##
## $Tcell_activation_day1_rep2
## An object of class Seurat
## 33544 features across 10548 samples within 2 assays
## Active assay: RNA (33538 features)
## 1 other assay present: HTO
From the cellranger’s web summary report and the demultiplexing of the hashtag oligonucleotides (HTO), we know that day 0 contained an excessive amount of cells. This suggests that we overloaded the cellranger lane and, although we can detect and remove most doublets thanks to the cell hashing, the homotypic doublets (those that share hashtag) still remain. Thus, we will try to predict and remove them with DoubletFinder.
DoubetFinder simulates doublets by averaging the UMI of two real cells. It subsequently project the simulations and real cells in PCA space and computes a nearest neighbors graph. Doublets are identified as those cells that have a proportion of artificial neighbors (pAN) greater than what you would expect by chance. We need to consider the following steps:
dr <- map_dbl(t_act_l, ~ 0.0008 * ncol(.x))
nExp_poi <- map2_dbl(dr, t_act_l, ~ .x * ncol(.y) / 100)
possible_pk <- c(
seq(0.001, 0.009, by = 0.001),
seq(0.01, 0.09, by = 0.01),
0.1, 0.15, 0.2, 0.25, 0.3
)
tprs_l <- map2(t_act_l, nExp_poi, function(seurat, nExp) {
seurat$doublets_hashing <- ifelse(seurat$hash.ID == "Doublet", "Doublet", "Singlet")
seurat <- pre_process_seurat(seurat)
tprs <- map_dbl(possible_pk, function(pK) {
seurat <- doubletFinder_v3(
seu = seurat,
PCs = 1:10,
pN = 0.25,
pK = pK,
nExp = nExp,
reuse.pANN = FALSE,
sct = FALSE
)
doublet_finder <- str_c("DF.classifications_0.25", pK, nExp, sep = "_")
tpr <- sum(seurat@meta.data[, doublet_finder] == "Doublet" & seurat$doublets_hashing == "Doublet") /
sum(seurat$doublets_hashing == "Doublet") * 100
tpr
})
tprs
})
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
# saveRDS(tprs_l, "results/R_objects/true_positive_rates_doublets.rds")
tprs_l
## $Tcell_activation_day0
## [1] 29.42865 29.27211 29.26341 29.28950 29.24602 29.21993 29.16775 29.02861 29.07209 29.00252 29.03731 29.01122 28.59379 28.52422 28.65467 28.71554 28.80250 28.79381 28.74163 28.88077 28.96774 29.06340 29.19384
##
## $Tcell_activation_day2
## [1] 14.255319 13.829787 14.255319 14.468085 13.723404 13.723404 13.085106 13.510638 13.191489 13.404255 12.446809 12.127660 11.702128 11.808511 11.489362 11.489362 11.702128 11.063830 11.170213 10.212766 10.744681 10.212766 9.893617
##
## $Tcell_activation_day0_rep2
## [1] 25.18033 24.81967 24.91803 24.68852 24.91803 24.81967 24.72131 24.55738 24.39344 24.39344 23.70492 23.14754 22.72131 21.01639 20.55738 20.49180 20.42623 20.09836 19.73770 18.52459 17.44262 17.40984 17.50820
##
## $Tcell_activation_day1_rep2
## [1] 34.76395 35.62232 36.05150 35.96567 35.87983 35.87983 35.45064 35.45064 35.45064 35.53648 33.99142 33.90558 33.04721 32.27468 31.67382 30.98712 31.07296 30.47210 30.12876 29.35622 28.75536 28.49785 28.49785
We can choose the pK that maximizes TPR:
pk_opt <- map_dbl(tprs_l, function(x) {
df <- data.frame(pk = possible_pk, tpr = x)
df_max <- df[which.max(df$tpr), ]
print(ggplot(df, aes(pk, tpr)) +
geom_point(color = "blue") +
geom_line(color = "blue") +
geom_text(data = df_max, aes(label = pk), color = "black") +
labs(x = "pK", y = "True Positive Rate (%)") +
theme_classic()
)
df_max[, "pk"]
})
pk_opt
## Tcell_activation_day0 Tcell_activation_day2 Tcell_activation_day0_rep2 Tcell_activation_day1_rep2
## 0.001 0.004 0.001 0.003
Then, let us run DobuletFinder with the optimal pK:
t_act_l <- pmap(list(t_act_l, pk_opt, nExp_poi), function(seurat, pk, nExp) {
seurat <- pre_process_seurat(seurat)
seurat <- doubletFinder_v3(
seu = seurat,
PCs = 1:10,
pN = 0.25,
pK = pk,
nExp = nExp,
reuse.pANN = FALSE,
sct = FALSE
)
seurat
})
## [1] "Creating 8854 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 2378 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 4146 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Creating 3516 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
Finally, we can visualize the predictions:
t_act_l <- purrr::map(t_act_l, function(seurat) {
doublet_col <- str_subset(colnames(seurat@meta.data), "^DF")
doublet_finder <- seurat@meta.data[, doublet_col]
seurat$doublet_hashing <- ifelse(seurat$hash.ID == "Doublet", "Doublet", "Singlet")
table(doublet_finder = doublet_finder, doublet_hashing = seurat$doublet_hashing)
seurat$doublet <- case_when(
doublet_finder == "Doublet" & seurat$doublet_hashing == "Doublet" ~ "Doublet 2X",
doublet_finder == "Doublet" & seurat$doublet_hashing == "Singlet" ~ "Doublet Finder",
doublet_finder == "Singlet" & seurat$doublet_hashing == "Doublet" ~ "Doublet Hashing",
doublet_finder == "Singlet" & seurat$doublet_hashing == "Singlet" ~ "Singlet"
)
seurat
})
purrr::map(t_act_l, ~ table(.x$doublet))
## $Tcell_activation_day0
##
## Doublet 2X Doublet Finder Doublet Hashing Singlet
## 3384 2260 8115 12803
##
## $Tcell_activation_day2
##
## Doublet 2X Doublet Finder Doublet Hashing Singlet
## 124 283 816 5912
##
## $Tcell_activation_day0_rep2
##
## Doublet 2X Doublet Finder Doublet Hashing Singlet
## 768 469 2282 8919
##
## $Tcell_activation_day1_rep2
##
## Doublet 2X Doublet Finder Doublet Hashing Singlet
## 420 470 745 8913
umaps_doublets <- purrr::map(t_act_l, function(seurat) {
Idents(seurat) <- "doublet"
DimPlot(
seurat,
reduction = "umap",
pt.size = 0.6,
cols = c("#CD2C14", "#4FAA52", "#1A40AD", "#EEE2B9")
)
})
umaps_doublets
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
We can see that the doublets detected by hashing are mostly within a cluster, while the ones detected by DoubletFinder are mostly between clusters. This is consistent with the fact that DoubletFinder fails at detecting doublets with similar transcriptional states. All in all, both methods are complementary and allow us to remove the bulk of the doublets in the dataset.
We can proceed to filter them out:
t_act_l <- purrr::map(t_act_l, function(seurat) {
Idents(seurat) <- "doublet"
seurat <- subset(seurat, idents = "Singlet")
seurat
})
t_act_l
## $Tcell_activation_day0
## An object of class Seurat
## 33544 features across 12803 samples within 2 assays
## Active assay: RNA (33538 features)
## 1 other assay present: HTO
## 3 dimensional reductions calculated: pca, tsne, umap
##
## $Tcell_activation_day2
## An object of class Seurat
## 33544 features across 5912 samples within 2 assays
## Active assay: RNA (33538 features)
## 1 other assay present: HTO
## 3 dimensional reductions calculated: pca, tsne, umap
##
## $Tcell_activation_day0_rep2
## An object of class Seurat
## 33544 features across 8919 samples within 2 assays
## Active assay: RNA (33538 features)
## 1 other assay present: HTO
## 3 dimensional reductions calculated: pca, tsne, umap
##
## $Tcell_activation_day1_rep2
## An object of class Seurat
## 33544 features across 8913 samples within 2 assays
## Active assay: RNA (33538 features)
## 1 other assay present: HTO
## 3 dimensional reductions calculated: pca, tsne, umap
Let us compute, for each gene, the number of cells in which we can detect at least 1 UMI:
gene_qc_gg <- purrr::map(t_act_l, function(seurat) {
n_cells <- rowSums(as.matrix(seurat[["RNA"]]@counts) > 0)
n_cells %>%
as.data.frame() %>%
ggplot(aes(n_cells)) +
geom_histogram(bins = 100, alpha = 0.75) +
scale_x_log10("Number of cells") +
theme_bw()
})
gene_qc_gg
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
We see two peaks, the first one of which corresponds to lowly expressed genes. As explained in Luecken MD et al.: “a guideline to setting this threshold is to use the minimum cell cluster size that is of interest and leaving some leeway for dropout effects”. As we will not rely on clusters that have fewer than 10 cells, we will use it as a filter:
purrr::map(gene_qc_gg, function(p) {
p +
geom_vline(xintercept = min_total_cells, color = "red", linetype = "dashed")
})
## $Tcell_activation_day0
##
## $Tcell_activation_day2
##
## $Tcell_activation_day0_rep2
##
## $Tcell_activation_day1_rep2
t_act_sce <- purrr::map(t_act_l, function(seurat) {
n_cells <- rowSums(as.matrix(seurat[["RNA"]]@counts) > 0)
sce <- as.SingleCellExperiment(seurat)
sce <- sce[n_cells > min_total_cells, ]
sce
})
t_act_sce
## $Tcell_activation_day0
## class: SingleCellExperiment
## dim: 14017 12803
## metadata(0):
## assays(2): counts logcounts
## rownames(14017): AL669831.5 LINC00115 ... AC004556.1 AC240274.1
## rowData names(5): vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
## colnames(12803): AAACCCAAGCATTTGC-1 AAACCCAAGCTCTGTA-1 ... TTTGTTGTCGGTATGT-1 TTTGTTGTCTGTAAGC-1
## colData names(18): orig.ident nCount_RNA ... doublet ident
## reducedDimNames(3): PCA TSNE UMAP
## spikeNames(0):
## altExpNames(0):
##
## $Tcell_activation_day2
## class: SingleCellExperiment
## dim: 13366 5912
## metadata(0):
## assays(2): counts logcounts
## rownames(13366): AL669831.5 LINC00115 ... AC004556.1 AC240274.1
## rowData names(5): vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
## colnames(5912): AAACCCAAGAGGCCAT-1 AAACCCAAGGAGAGTA-1 ... TTTGTTGCACTAACGT-1 TTTGTTGCATTCCTAT-1
## colData names(18): orig.ident nCount_RNA ... doublet ident
## reducedDimNames(3): PCA TSNE UMAP
## spikeNames(0):
## altExpNames(0):
##
## $Tcell_activation_day0_rep2
## class: SingleCellExperiment
## dim: 14085 8919
## metadata(0):
## assays(2): counts logcounts
## rownames(14085): AL669831.5 LINC00115 ... AC004556.1 AC240274.1
## rowData names(5): vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
## colnames(8919): AAACCCAAGACGGAAA-1 AAACCCAAGAGAGCCT-1 ... TTTGTTGGTTACAGCT-1 TTTGTTGGTTGTATGC-1
## colData names(18): orig.ident nCount_RNA ... doublet ident
## reducedDimNames(3): PCA TSNE UMAP
## spikeNames(0):
## altExpNames(0):
##
## $Tcell_activation_day1_rep2
## class: SingleCellExperiment
## dim: 13600 8913
## metadata(0):
## assays(2): counts logcounts
## rownames(13600): AL669831.5 LINC00115 ... AL354822.1 AC240274.1
## rowData names(5): vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
## colnames(8913): AAACCCAAGTCTGCAT-1 AAACCCACACGAGGAT-1 ... TTTGTTGCAGTTAGAA-1 TTTGTTGCATCATGAC-1
## colData names(18): orig.ident nCount_RNA ... doublet ident
## reducedDimNames(3): PCA TSNE UMAP
## spikeNames(0):
## altExpNames(0):
To confidently compare gene expression between cells, we need to correct for two biases:
To correct for both, we will use the scran package which according to both a recent and an old review is the most robust method for scRNA-seq data normalization:
t_act_sce <- purrr::map(t_act_sce, function(sce) {
sce <- computeSumFactors(sce)
print(summary(sizeFactors(sce)))
sce <- normalize(sce)
print(assays(sce))
logcounts(sce)[1:6, 1:6]
sce
})
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2454 0.5807 0.8374 1.0000 1.3090 4.0591
## List of length 2
## names(2): counts logcounts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09527 0.31494 0.62712 1.00000 1.48513 4.19161
## List of length 2
## names(2): counts logcounts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2174 0.6759 0.8286 1.0000 1.1924 3.7522
## List of length 2
## names(2): counts logcounts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1079 0.4207 0.9365 1.0000 1.4529 3.0481
## List of length 2
## names(2): counts logcounts
t_act_sce
## $Tcell_activation_day0
## class: SingleCellExperiment
## dim: 14017 12803
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(14017): AL669831.5 LINC00115 ... AC004556.1 AC240274.1
## rowData names(5): vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
## colnames(12803): AAACCCAAGCATTTGC-1 AAACCCAAGCTCTGTA-1 ... TTTGTTGTCGGTATGT-1 TTTGTTGTCTGTAAGC-1
## colData names(18): orig.ident nCount_RNA ... doublet ident
## reducedDimNames(3): PCA TSNE UMAP
## spikeNames(0):
## altExpNames(0):
##
## $Tcell_activation_day2
## class: SingleCellExperiment
## dim: 13366 5912
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(13366): AL669831.5 LINC00115 ... AC004556.1 AC240274.1
## rowData names(5): vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
## colnames(5912): AAACCCAAGAGGCCAT-1 AAACCCAAGGAGAGTA-1 ... TTTGTTGCACTAACGT-1 TTTGTTGCATTCCTAT-1
## colData names(18): orig.ident nCount_RNA ... doublet ident
## reducedDimNames(3): PCA TSNE UMAP
## spikeNames(0):
## altExpNames(0):
##
## $Tcell_activation_day0_rep2
## class: SingleCellExperiment
## dim: 14085 8919
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(14085): AL669831.5 LINC00115 ... AC004556.1 AC240274.1
## rowData names(5): vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
## colnames(8919): AAACCCAAGACGGAAA-1 AAACCCAAGAGAGCCT-1 ... TTTGTTGGTTACAGCT-1 TTTGTTGGTTGTATGC-1
## colData names(18): orig.ident nCount_RNA ... doublet ident
## reducedDimNames(3): PCA TSNE UMAP
## spikeNames(0):
## altExpNames(0):
##
## $Tcell_activation_day1_rep2
## class: SingleCellExperiment
## dim: 13600 8913
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(13600): AL669831.5 LINC00115 ... AL354822.1 AC240274.1
## rowData names(5): vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
## colnames(8913): AAACCCAAGTCTGCAT-1 AAACCCACACGAGGAT-1 ... TTTGTTGCAGTTAGAA-1 TTTGTTGCATCATGAC-1
## colData names(18): orig.ident nCount_RNA ... doublet ident
## reducedDimNames(3): PCA TSNE UMAP
## spikeNames(0):
## altExpNames(0):
Finally, we can convert it back to seurat and save it as a compressed .RDS file for future analysis:
# Recompute hash.ID variable
t_act_seu <- purrr::map(t_act_sce, function(sce) {
seurat <- as.Seurat(sce)
seurat@meta.data <- seurat@meta.data %>%
separate(col = "hash.ID", into = c("donor", "time", "temperature", "day"), sep = "-")
seurat
})
names(t_act_seu) <- c("day_0_rep1", "day_2_rep1", "day_0_rep2", "day_1_rep2")
# saveRDS(t_act_seu, "results/R_objects/t_act_Seurat_list_filtered_normalized.rds")
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] KernSmooth_2.23-16 fields_10.3 maps_3.3.0 spam_2.5-1 dotCall64_1.0-0 forcats_0.5.0 stringr_1.4.0 dplyr_0.8.4 purrr_0.3.3 readr_1.3.1 tidyr_1.0.2 tibble_2.1.3 tidyverse_1.3.0 DoubletFinder_2.0.2 ggridges_0.5.2 ggpubr_0.2.5 magrittr_1.5 Seurat_3.1.4 scran_1.14.6 scater_1.14.6 ggplot2_3.3.0 SingleCellExperiment_1.8.0 SummarizedExperiment_1.16.1 DelayedArray_0.12.2 BiocParallel_1.20.1 matrixStats_0.55.0 Biobase_2.46.0 GenomicRanges_1.38.0 GenomeInfoDb_1.22.0 IRanges_2.20.2 S4Vectors_0.24.3 BiocGenerics_0.32.0 BiocStyle_2.14.4
##
## loaded via a namespace (and not attached):
## [1] reticulate_1.14 tidyselect_1.0.0 htmlwidgets_1.5.1 Rtsne_0.15 munsell_0.5.0 codetools_0.2-16 mutoss_0.1-12 ica_1.0-2 statmod_1.4.34 future_1.16.0 withr_2.1.2 colorspace_1.4-1 knitr_1.28 rstudioapi_0.11 ROCR_1.0-7 ggsignif_0.6.0 gbRd_0.4-11 listenv_0.8.0 Rdpack_0.11-1 labeling_0.3 GenomeInfoDbData_1.2.2 mnormt_1.5-6 farver_2.0.3 vctrs_0.2.3 generics_0.0.2 TH.data_1.0-10 xfun_0.12 R6_2.4.1 ggbeeswarm_0.6.0 rsvd_1.0.3 locfit_1.5-9.1 bitops_1.0-6 assertthat_0.2.1 scales_1.1.0 multcomp_1.4-12 beeswarm_0.2.3 gtable_0.3.0 npsurv_0.4-0 globals_0.12.5 sandwich_2.5-1 rlang_0.4.5 splines_3.6.1 lazyeval_0.2.2 broom_0.5.5 BiocManager_1.30.10 yaml_2.2.1 reshape2_1.4.3
## [48] modelr_0.1.6 backports_1.1.5 tools_3.6.1 bookdown_0.18 ellipsis_0.3.0 gplots_3.0.3 RColorBrewer_1.1-2 TFisher_0.2.0 Rcpp_1.0.3 plyr_1.8.6 zlibbioc_1.32.0 RCurl_1.98-1.1 pbapply_1.4-2 viridis_0.5.1 cowplot_1.0.0 zoo_1.8-7 haven_2.2.0 ggrepel_0.8.1 cluster_2.1.0 fs_1.3.2 RSpectra_0.16-0 magick_2.3 data.table_1.12.8 lmtest_0.9-37 reprex_0.3.0 RANN_2.6.1 mvtnorm_1.1-0 fitdistrplus_1.0-14 hms_0.5.3 patchwork_1.0.0 lsei_1.2-0 evaluate_0.14 readxl_1.3.1 gridExtra_2.3 compiler_3.6.1 crayon_1.3.4 htmltools_0.4.0 RcppParallel_4.4.4 lubridate_1.7.4 DBI_1.1.0 dbplyr_1.4.2 MASS_7.3-51.5 Matrix_1.2-18 cli_2.0.2 gdata_2.18.0 metap_1.3 igraph_1.2.4.2
## [95] pkgconfig_2.0.3 sn_1.5-5 numDeriv_2016.8-1.1 plotly_4.9.2 xml2_1.2.2 vipor_0.4.5 dqrng_0.2.1 multtest_2.42.0 XVector_0.26.0 bibtex_0.4.2.2 rvest_0.3.5 digest_0.6.25 sctransform_0.2.1 RcppAnnoy_0.0.15 tsne_0.1-3 rmarkdown_2.1 cellranger_1.1.0 leiden_0.3.3 uwot_0.1.5 edgeR_3.28.1 DelayedMatrixStats_1.8.0 gtools_3.8.1 lifecycle_0.1.0 nlme_3.1-145 jsonlite_1.6.1 BiocNeighbors_1.4.2 viridisLite_0.3.0 limma_3.42.2 fansi_0.4.1 pillar_1.4.3 lattice_0.20-40 httr_1.4.1 plotrix_3.7-7 survival_3.1-8 glue_1.3.1 png_0.1-7 stringi_1.4.6 BiocSingular_1.2.2 caTools_1.18.0 irlba_2.3.3 future.apply_1.4.0 ape_5.3